{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Database preparation\n",
    "\n",
    "### Stop words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the French stop-word list: one entry per line, surrounding whitespace stripped.\n",
    "# The `with` block closes the file handle once reading is done (or if it fails);\n",
    "# errors='ignore' drops bytes that are not valid UTF-8 instead of raising.\n",
    "with codecs.open(dir + \"/data/raw/manifestos/french_stopwords.txt\", 'r', encoding='utf-8', errors='ignore') as file:\n",
    "    stop = [line.strip() for line in file]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Merging with the key file and cleaning the text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Character tables for the cleaning step; each entry is applied with str.replace in list order.\n",
    "APOSTROPHES = [\"'\", \"’\"]  # each is replaced by a space\n",
    "PUNCTUATION = [\".\", \"|\", \";\", ':', \",\", '\"', \"!\", '?', '/', '^', '»', '-', '_', '■', '□', '—', '•', '(', ')', '%', '\\'', '*', '#', '“', '”', '[', ']', '{', '}', '@', '>', '<', '$', '&', '+', '£', '«', '❖', '°', 'æ', '►', '®', '„', '‘', '=', '\\\\']\n",
    "DIGITS = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']\n",
    "SPECIAL_CHARS = ['\\x93', '\\x92', '\\x94', '\\x91', '\\x80']\n",
    "# (characters to replace, replacement letter)\n",
    "ACCENT_GROUPS = [\n",
    "    ([\"é\", \"è\", \"ë\", \"ê\", 'ê'], 'e'),\n",
    "    ([\"â\", \"ä\", \"à\", \"á\", \"ã\"], 'a'),\n",
    "    ([\"û\", \"ü\", \"ù\"], 'u'),\n",
    "    ([\"ö\", \"ô\", \"ó\", 'œ'], 'o'),\n",
    "    ([\"î\", \"ï\"], 'i'),\n",
    "    ([\"ç\"], 'c'),\n",
    "    ([\"ÿ\"], 'y'),\n",
    "]\n",
    "\n",
    "\n",
    "def _replace_all(text, chars, replacement):\n",
    "    \"\"\"Return `text` with every string in `chars` replaced by `replacement`.\"\"\"\n",
    "    for char in chars:\n",
    "        text = text.replace(char, replacement)\n",
    "    return text\n",
    "\n",
    "\n",
    "def _fold_accents(text):\n",
    "    \"\"\"Replace the characters listed in ACCENT_GROUPS by their unaccented letter.\"\"\"\n",
    "    for chars, plain in ACCENT_GROUPS:\n",
    "        text = _replace_all(text, chars, plain)\n",
    "    return text\n",
    "\n",
    "\n",
    "def clean_text(text, stop_tokens):\n",
    "    \"\"\"Lower-case `text`, remove punctuation, digits and accents, and drop stop words.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    text : str\n",
    "        Raw document text.\n",
    "    stop_tokens : set of str\n",
    "        Normalised stop-word tokens to remove. The token 'cher' is always kept,\n",
    "        even when it appears in `stop_tokens`.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    str\n",
    "        The remaining tokens joined by single spaces, so the result has no\n",
    "        leading, trailing or repeated whitespace.\n",
    "    \"\"\"\n",
    "    text = text.lower()\n",
    "    text = _replace_all(text, APOSTROPHES, ' ')\n",
    "    text = _replace_all(text, PUNCTUATION, '')\n",
    "    text = _replace_all(text, DIGITS, '')\n",
    "    text = _replace_all(text, SPECIAL_CHARS, '')\n",
    "    text = _fold_accents(text)\n",
    "    return ' '.join(w for w in text.split() if w not in stop_tokens or w == 'cher')\n",
    "\n",
    "\n",
    "# Normalise the stop words once, with the same apostrophe and accent handling as the documents.\n",
    "# Re-applying these replacements changes nothing, so the step is hoisted out of the year loop.\n",
    "stop = [_fold_accents(_replace_all(word, APOSTROPHES, ' ')) for word in stop]\n",
    "# Stop words are compared with whitespace-split tokens. An entry that now contains a space\n",
    "# (an elided form ending in an apostrophe ends up with a trailing space) could never match\n",
    "# as a whole, so every entry is split into tokens. A set keeps each membership test O(1).\n",
    "stop_tokens = {token for word in stop for token in word.split()}\n",
    "\n",
    "for year in ['1962', '1967', '1968', '1973', '1978', '1981', '1993', '1997', '2017']:\n",
    "    print(year)\n",
    "    df = pd.read_pickle(dir + '/data/raw/manifestos/df_raw_' + year)\n",
    "    if year != '1997':\n",
    "        # transformation to merge with stata database (the 1997 ids are used as they are)\n",
    "        df['id_doc'] = 'id' + df['id_doc']\n",
    "    merge = pd.read_stata(dir + '/data/intermediate/key_augmented_' + year + '.dta')\n",
    "    df = pd.merge(df[['id_doc', 'text']], merge, how='left', on='id_doc', indicator=True)\n",
    "    # Every document must have found its row in the key file.\n",
    "    n_unmatched = int((df['_merge'] != 'both').sum())\n",
    "    assert n_unmatched == 0, f'{n_unmatched} rows of {year} have no match in the key file'\n",
    "    del df['_merge']\n",
    "\n",
    "    cleaning = [clean_text(text, stop_tokens) for text in df['text'].tolist()]\n",
    "\n",
    "    # pd.merge returns a fresh 0..n-1 index, so this index-aligned join pairs rows in order.\n",
    "    df = df.join(pd.DataFrame({'text_clean': cleaning}))\n",
    "    df.to_pickle(dir + \"/data/intermediate/df_\" + year)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the cleaned text stored in row 0 of the most recently processed frame.\n",
    "df.loc[0, 'text_clean']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
